/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import net.nutch.util.*;
import net.nutch.searcher.Query.*;

/** Construct n-grams for frequently occurring terms and phrases while indexing.
 * Optimize phrase queries to use the n-grams. Single terms are still indexed
 * too, with n-grams overlaid. This is achieved through the use of {@link
 * Token#setPositionIncrement(int)}.
 *
 * <p>The common terms are loaded once, in a static initializer, from the
 * resource named by the <code>analysis.common.terms.file</code> configuration
 * property. They are kept per field name.
 */
public class CommonGrams {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.analysis.CommonGrams");

  /** Character placed between the terms of a compound (n-gram) token. */
  private static final char SEPARATOR = '-';

  /** Maps a field name (String) to the HashSet of common grams for it. */
  private static final HashMap COMMON_TERMS = new HashMap();

  static { init(); }

  private CommonGrams() {}                        // no public ctor

  /** Token filter that passes every input token through and additionally
   * emits compound tokens, with a position increment of zero, around tokens
   * whose text is in the common set. */
  private static class Filter extends TokenFilter {
    /** Common grams for the filtered field; may be null (nothing is common). */
    private final HashSet common;
    /** The input token most recently handled by {@link #next()}. */
    private Token previous;
    /** Tokens already constructed and waiting to be returned. */
    private final LinkedList gramQueue = new LinkedList();
    /** Input tokens read ahead by {@link #peekNext} but not yet consumed. */
    private final LinkedList nextQueue = new LinkedList();
    /** Scratch buffer reused when building compound token text. */
    private final StringBuffer buffer = new StringBuffer();

    /** Construct an n-gram producing filter.
     * @param input the stream to read tokens from
     * @param common set of common gram strings, or null for none
     */
    public Filter(TokenStream input, HashSet common) {
      super(input);
      this.common = common;
    }

    /** Inserts n-grams into a token stream.
     *
     * <p>Queued tokens are returned first. Otherwise the next input token is
     * fetched; a token that is not common is returned unchanged. For a
     * common token, the token itself is queued, a gram joining it to an
     * uncommon predecessor is queued in front of it, and grams joining it to
     * the following tokens are queued behind it for as long as the growing
     * gram is itself common.
     *
     * @return the next token, or null at end of stream
     * @throws IOException if the underlying stream fails
     */
    public Token next() throws IOException {
      if (gramQueue.size() != 0) {                // consume any queued tokens
        return (Token)gramQueue.removeFirst();
      }

      final Token token = popNext();
      if (token == null) {
        return null;
      }

      if (!isCommon(token)) {                     // optimize simple case
        previous = token;
        return token;
      }

      gramQueue.add(token);                       // queue the token

      // Look ahead through nextQueue without consuming: the peeked tokens
      // must still be returned as ordinary tokens by later calls.
      ListIterator i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous)) { // queue prev gram first
          gramQueue.addFirst(gramToken(previous, gram));
        }

        Token next = peekNext(i);
        if (next == null) {
          break;
        }

        gram = gramToken(gram, next);             // queue next gram last
        gramQueue.addLast(gram);
      }

      previous = token;
      return (Token)gramQueue.removeFirst();
    }

    /** True iff token is for a common term. Always false when no common set
     * was supplied. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Pops nextQueue or, if empty, reads a new token. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0) {
        return (Token)nextQueue.removeFirst();
      } else {
        return input.next();
      }
    }

    /** Return next token in nextQueue, extending it when empty.
     * @param i iterator over nextQueue marking the current look-ahead point
     * @return the token after the look-ahead point, or null if input is
     *         exhausted
     */
    private Token peekNext(ListIterator i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null) {
          return null;
        }
        i.add(next);                              // append to nextQueue
        i.previous();                             // step back so next() sees it
      }
      return (Token)i.next();
    }

    /** Construct a compound token whose text is the two term texts joined by
     * SEPARATOR, spanning from the start of the first token to the end of the
     * second, with type "gram" and a position increment of zero. */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
  }

  /** Construct using the provided config file.
   *
   * <p>Reads the resource named by the <code>analysis.common.terms.file</code>
   * property line by line. Blank lines and lines starting with '#' are
   * skipped. The reader is closed when loading finishes or fails.
   *
   * @throws RuntimeException wrapping any IOException raised while reading
   */
  private static void init() {
    BufferedReader in = null;
    try {
      Reader reader = NutchConf.getConfResourceAsReader
        (NutchConf.get("analysis.common.terms.file"));
      in = new BufferedReader(reader);
      String line;
      while ((line = in.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || "".equals(line)) { // skip comments
          continue;
        }
        addCommonTerm(line);
      }
    } catch (IOException e) {
      // keep the original message but also preserve the cause
      throw new RuntimeException(e.toString(), e);
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          LOG.warning("Could not close common terms file: " + e);
        }
      }
    }
  }

  /** Parses one config line and records its gram in COMMON_TERMS.
   *
   * <p>The line is tokenized with NutchDocumentTokenizer. The first token is
   * the field name; the remaining tokens, joined by SEPARATOR, form the gram.
   * Lines with fewer than two tokens are logged and ignored.
   */
  private static void addCommonTerm(String line) throws IOException {
    TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
    Token token = ts.next();
    if (token == null) {
      LOG.warning("Line does not contain a field name: " + line);
      return;
    }
    String field = token.termText();

    token = ts.next();
    if (token == null) {
      LOG.warning("Line contains only a field name, no word: " + line);
      return;
    }

    StringBuffer gram = new StringBuffer();
    gram.append(token.termText());
    while ((token = ts.next()) != null) {
      gram.append(SEPARATOR);
      gram.append(token.termText());
    }

    HashSet table = (HashSet)COMMON_TERMS.get(field);
    if (table == null) {
      table = new HashSet();
      COMMON_TERMS.put(field, table);
    }
    table.add(gram.toString());
  }

  /** Construct a token filter that inserts n-grams for common terms. For use
   * while indexing documents.
   * @param ts the token stream to wrap
   * @param field field name used to look up the common terms; a field with no
   *        configured terms yields a filter that adds no grams
   */
  public static TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
  }

  /** Utility to convert an array of Query.Terms into a token stream. Each
   * term becomes one token whose start and end offsets are its array index
   * and the following index. */
  private static class ArrayTokens extends TokenStream {
    private final Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) {
      this.terms = phrase.getTerms();
    }

    public Token next() {
      if (index == terms.length) {
        return null;
      } else {
        return new Token(terms[index].toString(), index, ++index);
      }
    }
  }

  /** Optimizes phrase queries to use n-grams when possible.
   *
   * <p>The phrase terms are run through the n-gram filter for the field. Of
   * the tokens sharing a position, only the last one emitted is kept.
   * Scanning stops at the first token whose position plus arity equals the
   * number of phrase terms.
   *
   * @param phrase the phrase to optimize
   * @param field field whose common terms apply
   * @return the term and gram strings to query for, in order
   * @throws RuntimeException wrapping any IOException from the token stream
   */
  public static String[] optimizePhrase(Phrase phrase, String field) {
    ArrayList result = new ArrayList();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    final int termCount = phrase.getTerms().length;
    Token token, prev = null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        // a non-zero increment starts a new position: emit the last token
        // seen at the position just left
        if (token.getPositionIncrement() != 0 && prev != null) {
          result.add(prev.termText());
        }
        prev = token;
        position += token.getPositionIncrement();
        if ((position + arity(token.termText())) == termCount) {
          break;
        }
      }
    } catch (IOException e) {
      // keep the original message but also preserve the cause
      throw new RuntimeException(e.toString(), e);
    }
    if (prev != null) {
      result.add(prev.termText());
    }

    return (String[])result.toArray(new String[result.size()]);
  }

  /** Counts the SEPARATOR characters in a gram, ignoring one at index 0.
   * A single term therefore has arity 0 and a two-term gram arity 1. */
  private static int arity(String gram) {
    int count = 0;
    int index = gram.indexOf(SEPARATOR, 1);       // search starts at index 1
    while (index != -1) {
      count++;
      index = gram.indexOf(SEPARATOR, index + 1);
    }
    return count;
  }

  /** For debugging. Tokenizes the command-line arguments with the "url"
   * field's filter, prints each token, then prints the optimized phrase. */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }

    TokenStream ts =
      new NutchDocumentTokenizer(new StringReader(text.toString()));
    ts = getFilter(ts, "url");

    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }

    String[] optimized = optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }
}